from collections import defaultdict
import numpy as np
import sys

#function to make list of read names and list of sequence reads in the same order
def get_reads(fn):
	"""Collect read names and sequences from an iterable of FASTQ-style lines.

	Lines are consumed in groups of four: the first line of each group is
	kept as the read name, the second as the sequence, and the remaining
	two are ignored. Surrounding whitespace is stripped from every line.

	Args:
		fn: an open file (or any iterable of lines).

	Returns:
		A tuple ``(sequences, read_names)``; both lists are in file order,
		so entry ``k`` of one corresponds to entry ``k`` of the other.
	"""
	names = []
	sequences = []
	# Slot 0 of each four-line record is the name, slot 1 the sequence.
	targets = (names, sequences)
	for position, raw in enumerate(fn):
		text = raw.strip()
		slot = position % 4
		if slot < 2:
			targets[slot].append(text)
	return sequences, names

#function to separate reads into tag groups using a dictionary: a tag sequence not yet in the dictionary is added as a new group, otherwise the read joins the existing group
def rdnames_in_taggrps(listm,listn,tag_length=12):
	"""Map each read name to the integer id of its tag group.

	Reads whose sequences start with the same ``tag_length`` characters
	belong to the same tag group. Group ids are assigned 0, 1, 2, ... in
	order of first appearance of each tag.

	Args:
		listm: read-name lines, in the same order as ``listn``. The read
			name is the text before the first space with its first
			character removed (presumably the leading "@" of a FASTQ
			name line).
		listn: read sequences; entry ``i`` belongs to ``listm[i]``.
		tag_length: number of leading characters of each sequence that
			form the tag. Defaults to 12.

	Returns:
		dict mapping read name -> tag group id. If a read name occurs
		more than once, the last occurrence wins.

	Raises:
		IndexError: if ``listn`` has fewer entries than ``listm``.
	"""
	tagseq_to_taggrp = {}
	rdname_to_taggrp = {}
	for i, read in enumerate(listm):
		rdname = read.split(" ")[0][1:]
		tagseq = listn[i][:tag_length]
		# An unseen tag gets the next free id, which equals the number of
		# tags registered so far.
		taggrp = tagseq_to_taggrp.setdefault(tagseq, len(tagseq_to_taggrp))
		rdname_to_taggrp[rdname] = taggrp
	return rdname_to_taggrp
#		count_by_id[taggrp]+=1


# Read the FASTQ-style read file (argv[3]) and assign every read name to a
# tag group. The file is closed as soon as it has been parsed.
with open(sys.argv[3], "r") as my_read_file:
	seq_list, rdname_list = get_reads(my_read_file)
rdname_to_taggrp = rdnames_in_taggrps(rdname_list,seq_list)

#these should be the same: the mapping is keyed by read name, so it has fewer
#entries than there are reads only when read names are duplicated
assert len(rdname_list) == len(rdname_to_taggrp), (
	"%d reads but %d distinct read names; read names are not unique"
	% (len(rdname_list), len(rdname_to_taggrp)))


# Open the SAM alignment file (argv[1]) for reading and create the two output
# tables: argv[2] and argv[2] with a ".raw" suffix.
myfile = open(sys.argv[1], "r")
out_path = sys.argv[2]
outfile = open(out_path, "w")
outfile_raw = open("%s.raw"%(out_path), "w")

#gets rid of the first 100 lines of the sam file (the header for 100 loci);
#the count is hard-coded, and there will be more header lines if you have more
#targets and more synthetic references
HEADER_LINES = 100
for _ in range(HEADER_LINES):
	myfile.readline()
#index ends one past the number of lines skipped
index = HEADER_LINES + 1

# Load the per-locus key: the first CSV column is the locus name and the next
# two columns are integers, stored as MIP_dict[name] = (column 2, column 3).
# The first line of the file is a header and is kept separately.
# NOTE(review): open() does not expand "~", so this path is resolved relative
# to the current working directory -- confirm that this is intended.
with open("~MIP_data_and_analysis/master_for_calling.csv", "r") as master_file:
	header = master_file.readline()
	master_key = master_file.readlines()
MIP_dict={}
for line in master_key:
	info = line.strip()
	# A blank line (e.g. a trailing newline at the end of the file) has no
	# columns to parse.
	if not info:
		continue
	info_list = info.split(",")
	MIP_dict[info_list[0]] = int(info_list[1]),int(info_list[2])

# hists_by_tag[tag group][unit number] -> number of reads in that tag group
# that aligned to a reference with that unit number.
hists_by_tag = {}

newfile = myfile.readlines()

# The parameters of the locus named on the command line (argv[4]) do not
# change between reads, so look them up once instead of on every line.
locus_params = MIP_dict[sys.argv[4]]

for item in newfile:
	# Skip blank lines and any SAM header lines (they start with "@") that
	# are still present, e.g. when the header is longer than the fixed number
	# of lines discarded after opening the file.
	if not item.strip() or item.startswith("@"):
		continue
	splititem = item.split("\t")
	# "*" in the reference-name column marks a read without an alignment.
	if splititem[2] == "*":
		continue
	# The unit number is the integer after the last "_" of the reference name.
	number = int(splititem[2].split("_")[-1])
	# The score is the third ":"-separated piece of the 12th column
	# (presumably an "AS:i:<score>" tag -- confirm against the aligner output).
	score = int(splititem[11].split(":")[2])
	readname = str(splititem[0])
	group = rdname_to_taggrp[readname]
	#if read ends in the STR, unit number will be reported as negative number
	# NOTE(review): 240 is presumably the read length -- confirm.
	if locus_params[1] + number*locus_params[0] >= 240:
		number = -number
	#we only consider alignment scores A >= 180
	if score >= 180:
		tag_hist = hists_by_tag.setdefault(group, {})
		tag_hist[number] = tag_hist.get(number, 0) + 1

# Column header rows for the two tab-separated output tables.
OUT_COLUMNS = "tag\tcount\tcp_number\ttotal_count\n"
RAW_COLUMNS = "tag\tcp_number\ttotal_count\n"
outfile.write(OUT_COLUMNS)
outfile_raw.write(RAW_COLUMNS)

			
			
			